{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "98abe672",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-1-8ae4025e7ff4>:19: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "# Chrome launch options (notes translated from the original author's comments).\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the 'DevToolsActivePort file doesn't exist' startup error\n",
    "opts.add_argument('window-size=1920x3000')  # requested browser window size\n",
    "opts.add_argument('--disable-gpu')  # per the author's note: suggested by Google's docs to avoid a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false')  # skip loading images for speed\n",
    "#opts.add_argument('--headless')  # no visible browser window; per the author's note, needed on Linux systems without a display\n",
    "# opts.binary_location = \"C:\\portable\\PortableApps\\IronPortable\\App\\Iron\\chrome.exe\"\n",
    "# opts.binary_location = \"C:\\Program Files\\Google\\Chrome\\Application\\chromedriver.exe\" #\"H:\\_coding_\\Gitee\\InternetNewMedia\\CapstonePrj2016\\chromedriver.exe\"  \n",
    "\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword.\n",
    "driver = webdriver.Chrome(options=opts)  # desired_capabilities=caps,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ddce0b8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "69770c87",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch the login form to account/password mode.\n",
    "# Before clicking (or any other action), check that the XPath matched the intended element.\n",
    "element = driver.find_element(By.XPATH, '//a[@class=\"login__type__container__select-type\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "362d41b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Account credentials are prompted for at run time so they are never stored in the notebook.\n",
    "from getpass import getpass\n",
    "\n",
    "payload = {\"account\": input(\"account: \"), \"password\": getpass(\"password: \")}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0821d684",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the account name into the login form.\n",
    "element = driver.find_element(By.XPATH, '//input[@name=\"account\"]')\n",
    "element.clear()\n",
    "element.send_keys(payload['account'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c0b09089",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the password into the login form.\n",
    "element = driver.find_element(By.XPATH, '//input[@name=\"password\"]')\n",
    "element.clear()\n",
    "element.send_keys(payload['password'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "df291b52",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the login button to submit the form.\n",
    "element = driver.find_element(By.XPATH, '//a[@class=\"btn_login\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b7fe58a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the expand control (id m_open).\n",
    "element = driver.find_element(By.XPATH, '//a[@id=\"m_open\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "8cb28c4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the article-material (图文素材) menu entry.\n",
    "# NOTE(review): absolute XPath; presumably breaks if the page layout changes - confirm.\n",
    "element = driver.find_element(By.XPATH, '/html/body/div[4]/div[2]/ul/li[2]/ul/li[1]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "e0f506b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the \"+\" (new creation) card icon.\n",
    "element = driver.find_element(By.XPATH, '//i[@class=\"weui-desktop-card__icon-add\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "035b368e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the 'write new article' (写新图文) icon and follow its link.\n",
    "element = driver.find_element(By.XPATH, '//a//i[@class=\"icon-svg-editor-appmsg\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "cf599257",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-BDBCF91FAFAD2C58987B9381ECABC4FB',\n",
       " 'CDwindow-7AA5E1417786CC2A43B8AC88ECC6F630']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 两个窗口下 进行窗口定位 \n",
    "# 窗口信息检查（>1）\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "bafc4996",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-20-6c6d5ce6602d>:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Switch the driver to the second window handle.\n",
    "# `switch_to.window` replaces the deprecated `switch_to_window`.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a45785bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the insert-hyperlink item of the editor.\n",
    "element = driver.find_element(By.XPATH, '//li[@id=\"js_editor_insertlink\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "407d7854",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the button for choosing another official account.\n",
    "element = driver.find_element(By.XPATH, '//button[@class=\"weui-desktop-btn weui-desktop-btn_default\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "267b17c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "公众号 = \"广州吃喝玩乐\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "11ad4da3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Type the account name into the search box.\n",
    "element = driver.find_element(By.XPATH, '//input[@placeholder=\"输入文章来源的公众号名称或微信号，回车进行搜索\"]')\n",
    "element.clear()\n",
    "element.send_keys(公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "8903cbad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__search weui-desktop-icon__small\" style=\"width: 20px; height: 20px;\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!---->     <svg viewBox=\"0 0 24 24\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\"><title>MP/Icon/Search</title> <g id=\"MP/Icon/Search\" stroke=\"none\" stroke-width=\"1\" fill=\"none\" fill-rule=\"evenodd\"><path d=\"M5.78025253,5.78248558 C8.51392257,3.04881554 12.9460774,3.04881554 15.6797475,5.78248558 C18.1730922,8.27583028 18.3922898,12.1821488 16.3373403,14.9239313 L20.6294949,19.2175144 L19.2152814,20.631728 L14.922508,16.3389663 C12.180685,18.394566 8.27384272,18.1755707 5.78025253,15.6819805 C3.04658249,12.9483105 3.04658249,8.51615562 5.78025253,5.78248558 Z M6.8409127,6.84314575 C4.6930291,8.99102935 4.6930291,12.4734367 6.8409127,14.6213203 C8.98879631,16.7692039 12.4712037,16.7692039 14.6190873,14.6213203 C16.7669709,12.4734367 16.7669709,8.99102935 14.6190873,6.84314575 C12.4712037,4.69526215 8.98879631,4.69526215 6.8409127,6.84314575 Z\" id=\"形状\"></path></g></svg> <!----> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# Print the search button's inner HTML as a check that the XPath matched, then click it.\n",
    "element = driver.find_element(By.XPATH, '//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "c35f8401",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/CbiahvU88bqrF0bw6qMxmFCHuZfaIQnowiaBfZVCdIHlWicLR1NRs2eOYrNdyHC3tMJLzbXw4KhSzj4FsS3XIptLQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：gz0020</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/Jxsw2zUlLjWuf3I6PpntLKwCib1sZaV4icNX2umGYnYENcrZY5H9Z4VZshF1M5pWeaGL3IIysrZHAbRAKiceuOBSA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐蒲</strong> <i class=\"inner_link_account_wechat\">微信号：pupu020</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/DUpXjQUpR2ZZ1x8iajmZP4Nbx4UggbnVzSqY4icKjRRzyDqY8L4w6Q6cJdhcer5W0da44D94Agk6g3CqqpJt1msA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：henrygg3</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/FfvfsOKMFWYZ3vPpMjdNpZP2riaXyhQbt6jPo74HxrqvzDljib1EfBed3RmbQmeLWxH0eAiapziaicBrricHibhgpdLOw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：gzchwl666</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/7mkTEJgkBib9GeedpMVsiajuw21iacSz8sNFpDhbxwqQ0ySs7T1CVv8EFomHkOwf74dVFncPqKv6uAgaEUnKaZBAQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：gzchwlgo</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/HRqtPHB5O2KlKKqZgX3FibFdY8tlk601nUlxZ0OvLibnc4VFyHhicwu8BK38JzGGRmzZXdia9zEy4uUkNfjCDo88ww/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：ChwlInGuangZhou</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/Kd8R7WEGPVzibrbSpibuWO06D40ySzpYkGoYJwcHJqh9ltNT1sd7vgJGicT8081NJTNLE4APklmnjlElPITRIb8zw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：GZ_chihewanle</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/gmhSazQYu0kUsdicyAiaepiaYdI7ww1Z3ict9C9IDzQuGfNxgERltg1DQZic5zHVVricxNNgHXmwZqu8cdXsRwNfjCaQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：gzchwlei</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/ZKiapRichNrXj70HiaLVKsp9LUw4ByheLFWAicyFqkOaZneEHUoSO4kbicNevMZGaNF8M0po2XnpHLUzaxibn0EEk2ew/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：gzchwl8</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/Au5bEoXgTIbAM6I4DX8VIKgUc0AtkZtw2VtlEFiaoQIicZIribADe1J55F7ac7xhNBsrBAoMP9WJp13k5GKjiaZHww/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：guanzhoux</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/uic1icR0u6ghLlXkeYS78ricNQibkibib3WMF1icEVpYv7uQicF0NxeRaqUeXXOpYCLH2CGPhBoSUWK8XxM8593wJg1S9Q/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：ggzz55</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/HrRYRjeq1c0pMoBSZfKHTsoFAicgbp4eUza0NpQUMAD64p8xtWwtaHSlCziboQaTX2wiczsIX81gDO6UbeZib7SVOg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐生活</strong> <i class=\"inner_link_account_wechat\">微信号：gzlive020</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/pcr6xY7ybBtEQjwAicCiclcicagN0icgDszKhicMJ385IOibkLtqx75YwOHWrk2xNhoNZ4GQyd7PZEtrDeJkaRCFUlqg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州市吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/yY7qdw89ZhgfZXTx3NJwNOt8AQGx9aw8TEMw8P1yiaS0lpRrtEhLy69FN87If4qFwcKC7KLrFlQx2GKIFDMrtpg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐商城</strong> <i class=\"inner_link_account_wechat\">微信号：chwlj020</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/Cz5nhgFqmuV6tOxRfibPjMRtXWCZrrxXjK7pmARDYC7o05LfZibTkmAUO4d91fkibhKia8BsAEV1LTBoghU2p5o4qA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐达人</strong> <i class=\"inner_link_account_wechat\">微信号：daren020</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/icq6ARaS6rCpoTSc7xJbiahXHNcmm3XWRCXxrEjicia6HEgkpTcNo8v9XaFH1wrZJoajvrnF5Sv8iay1fhGbbENhlhw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐公司</strong> <i class=\"inner_link_account_wechat\">微信号：chwlgs020</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/dtPGfkhUfVdwpcXbXNfodqMoVCqdF2ricq7s9TTiaqBQHKtw2fxPAQQHFNWVSJicZgKSjhtNKPmFia2hLHn5SsunVg/0?wx_fmt=png\" 
class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐IN白云</strong> <i class=\"inner_link_account_wechat\">微信号：GZbaiyun33</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/6icyYJbwdiaMibUZYIOgcERW9EAvFicfmneM4vZsG8XHOut1RUMxKTspn7UwdBODksy8HQFsLicc9StVuMNIMRia492A/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐满FUN</strong> <i class=\"inner_link_account_wechat\">微信号：INgzfun</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/kytgX6QsK8GY0y5ibHtp2L6d6xkYFMZibWGZPRv9IGia99ibQb6oiaOicMicsBcRAquRSV8QESn0hnRM6sJczs14N9O4g/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐鸭</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/MDykvHscrMMofL6iczQRiaBw8ptGqy94kibU772KOHDos56R123jicaYCLibdM6mH5c6QibHfP8wlajMk9BsR0uQFENA/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州美食吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：GZ-chwl</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li>\n"
     ]
    }
   ],
   "source": [
    "# Capture the inner HTML of the account result list and keep it for parsing.\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "01a93c28",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output path templates; each value carries a literal {公众号} placeholder.\n",
    "fn = {\n",
    "    \"output\": {\n",
    "        \"公众号_htm_snippets\": \"_data_raw_src/公众号_htm_snippets_{公众号}.tsv\",\n",
    "        \"公众号_df\": \"_data_raw_src/公众号_df_{公众号}.tsv\",\n",
    "        \"公众号_xlsx\": \"data_sets/公众号_url_{公众号}.xlsx\",\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "d70fc639",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "4006126f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 解析\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "3e374ab8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：gz0020</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/CbiahvU88bqrF0b...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>广州吃喝玩乐蒲</td>\n",
       "      <td>微信号：pupu020</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/Jxsw2zUlLjWuf3I...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：henrygg3</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/DUpXjQUpR2ZZ1x8...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：gzchwl666</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/FfvfsOKMFWYZ3vP...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：gzchwlgo</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/7mkTEJgkBib9Gee...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：ChwlInGuangZhou</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/HRqtPHB5O2KlKKq...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：GZ_chihewanle</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/Kd8R7WEGPVzibrb...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：gzchwlei</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/gmhSazQYu0kUsdi...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：gzchwl8</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/ZKiapRichNrXj70...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：guanzhoux</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/Au5bEoXgTIbAM6I...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>广州吃喝玩乐</td>\n",
       "      <td>微信号：ggzz55</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/uic1icR0u6ghLlX...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>广州吃喝玩乐生活</td>\n",
       "      <td>微信号：gzlive020</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/HrRYRjeq1c0pMoB...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>广州市吃喝玩乐</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/pcr6xY7ybBtEQjw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>广州吃喝玩乐商城</td>\n",
       "      <td>微信号：chwlj020</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/yY7qdw89ZhgfZXT...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>广州吃喝玩乐达人</td>\n",
       "      <td>微信号：daren020</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/Cz5nhgFqmuV6tOx...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>广州吃喝玩乐公司</td>\n",
       "      <td>微信号：chwlgs020</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/icq6ARaS6rCpoTS...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>广州吃喝玩乐IN白云</td>\n",
       "      <td>微信号：GZbaiyun33</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/dtPGfkhUfVdwpcX...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>广州吃喝玩乐满FUN</td>\n",
       "      <td>微信号：INgzfun</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/6icyYJbwdiaMibU...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>广州吃喝玩乐鸭</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/kytgX6QsK8GY0y5...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>广州美食吃喝玩乐</td>\n",
       "      <td>微信号：GZ-chwl</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/MDykvHscrMMofL6...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      nickname               wechat  \\\n",
       "0       广州吃喝玩乐           微信号：gz0020   \n",
       "1      广州吃喝玩乐蒲          微信号：pupu020   \n",
       "2       广州吃喝玩乐         微信号：henrygg3   \n",
       "3       广州吃喝玩乐        微信号：gzchwl666   \n",
       "4       广州吃喝玩乐         微信号：gzchwlgo   \n",
       "5       广州吃喝玩乐  微信号：ChwlInGuangZhou   \n",
       "6       广州吃喝玩乐    微信号：GZ_chihewanle   \n",
       "7       广州吃喝玩乐         微信号：gzchwlei   \n",
       "8       广州吃喝玩乐          微信号：gzchwl8   \n",
       "9       广州吃喝玩乐        微信号：guanzhoux   \n",
       "10      广州吃喝玩乐           微信号：ggzz55   \n",
       "11    广州吃喝玩乐生活        微信号：gzlive020   \n",
       "12     广州市吃喝玩乐              微信号：未设置   \n",
       "13    广州吃喝玩乐商城         微信号：chwlj020   \n",
       "14    广州吃喝玩乐达人         微信号：daren020   \n",
       "15    广州吃喝玩乐公司        微信号：chwlgs020   \n",
       "16  广州吃喝玩乐IN白云       微信号：GZbaiyun33   \n",
       "17  广州吃喝玩乐满FUN          微信号：INgzfun   \n",
       "18     广州吃喝玩乐鸭              微信号：未设置   \n",
       "19    广州美食吃喝玩乐          微信号：GZ-chwl   \n",
       "\n",
       "                                                  img  \n",
       "0   http://mmbiz.qpic.cn/mmbiz_png/CbiahvU88bqrF0b...  \n",
       "1   http://mmbiz.qpic.cn/mmbiz_png/Jxsw2zUlLjWuf3I...  \n",
       "2   http://mmbiz.qpic.cn/mmbiz_png/DUpXjQUpR2ZZ1x8...  \n",
       "3   http://mmbiz.qpic.cn/mmbiz_png/FfvfsOKMFWYZ3vP...  \n",
       "4   http://mmbiz.qpic.cn/mmbiz_png/7mkTEJgkBib9Gee...  \n",
       "5   http://mmbiz.qpic.cn/mmbiz_png/HRqtPHB5O2KlKKq...  \n",
       "6   http://mmbiz.qpic.cn/mmbiz_png/Kd8R7WEGPVzibrb...  \n",
       "7   http://mmbiz.qpic.cn/mmbiz_png/gmhSazQYu0kUsdi...  \n",
       "8   http://mmbiz.qpic.cn/mmbiz_png/ZKiapRichNrXj70...  \n",
       "9   http://mmbiz.qpic.cn/mmbiz_png/Au5bEoXgTIbAM6I...  \n",
       "10  http://mmbiz.qpic.cn/mmbiz_png/uic1icR0u6ghLlX...  \n",
       "11  http://mmbiz.qpic.cn/mmbiz_png/HrRYRjeq1c0pMoB...  \n",
       "12  http://mmbiz.qpic.cn/mmbiz_png/pcr6xY7ybBtEQjw...  \n",
       "13  http://mmbiz.qpic.cn/mmbiz_png/yY7qdw89ZhgfZXT...  \n",
       "14  http://mmbiz.qpic.cn/mmbiz_png/Cz5nhgFqmuV6tOx...  \n",
       "15  http://mmbiz.qpic.cn/mmbiz_png/icq6ARaS6rCpoTS...  \n",
       "16  http://mmbiz.qpic.cn/mmbiz_png/dtPGfkhUfVdwpcX...  \n",
       "17  http://mmbiz.qpic.cn/mmbiz_png/6icyYJbwdiaMibU...  \n",
       "18  http://mmbiz.qpic.cn/mmbiz_png/kytgX6QsK8GY0y5...  \n",
       "19  http://mmbiz.qpic.cn/mmbiz_png/MDykvHscrMMofL6...  "
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build one record (nickname, wechat id text, avatar URL) per account entry in the parsed result list.\n",
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "account_list = [\n",
    "    {\n",
    "        \"nickname\": item.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text,\n",
    "        \"wechat\": item.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text,\n",
    "        \"img\": item.xpath('./div/img/@src')[0],\n",
    "    }\n",
    "    for item in 主\n",
    "]\n",
    "\n",
    "df_account = pd.DataFrame(account_list)\n",
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "0f3080b7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/CbiahvU88bqrF0bw6qMxmFCHuZfaIQnowiaBfZVCdIHlWicLR1NRs2eOYrNdyHC3tMJLzbXw4KhSzj4FsS3XIptLQ/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">广州吃喝玩乐</strong> <i class=\"inner_link_account_wechat\">微信号：gz0020</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "# Select the first account in the result list; its inner HTML is printed first as a check.\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "00e1d813",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_title = driver.find_element_by_xpaht(\\'//div[@class=\"inner_link_article_title\"]//span//text()\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Page-jump test, kept disabled as a reference for the pagination controls.\n",
    "# (Written as comments so the cell no longer echoes a string literal as its result.)\n",
    "# NOTE(review): the title XPath below selects text nodes, which find_element cannot return - confirm before re-enabling.\n",
    "#\n",
    "# 跳转_input = driver.find_element(By.XPATH, '//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "# 跳转_a = driver.find_element(By.XPATH, '//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "# 跳转_title = driver.find_element(By.XPATH, '//div[@class=\"inner_link_article_title\"]//span//text()')\n",
    "# 跳转_input.clear()\n",
    "# 跳转_input.send_keys(2)\n",
    "# 跳转_a.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "2efc1b69",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 184]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# Read the pagination number labels to find the upper page limit.\n",
    "# Presumably [current page, last page], as in the recorded output [1, 184] - confirm.\n",
    "l_e = driver.find_elements(By.XPATH, '//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int = [int(x.text) for x in l_e]\n",
    "print(l_e_int)\n",
    "print(l_e_int[0] == l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "9e900562",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184]\n"
     ]
    }
   ],
   "source": [
    "# Every page number from 1 up to the last value read from the pagination labels.\n",
    "pages = list(range(1, l_e_int[-1] + 1))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "1f0f0d00",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict this run to pages 1 through 51.\n",
    "pages = list(range(1, 52))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "7b2025c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scraped HTML keyed by page number; process_pages() writes into this dict.\n",
    "html_raw = {}\n",
    "# Module-level placeholders initialised to empty values.\n",
    "main_content = \"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "a87903d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages(pages):\n",
    "    \"\"\"Visit each page number in `pages` and cache its article-list HTML.\n",
    "\n",
    "    For every page number: type it into the pagination jump box, click the\n",
    "    jump link, sleep 45 + 120 * random() seconds, then store the innerHTML of\n",
    "    the `inner_link_article_list` div in the global `html_raw` dict under\n",
    "    that page number. Progress is printed as tab-separated page numbers.\n",
    "\n",
    "    NOTE(review): relies on `driver`, `time`, `random` and `html_raw` from\n",
    "    earlier cells; `random` is presumably `random.random` - confirm.\n",
    "    \"\"\"\n",
    "    for page_no in pages:\n",
    "        print(page_no, end='\\t')\n",
    "\n",
    "        jump_box = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "        jump_link = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "        jump_box.clear()\n",
    "        jump_box.send_keys(page_no)\n",
    "        jump_link.click()\n",
    "\n",
    "        # Long randomised pause before reading the list (presumably to let the\n",
    "        # page load and to throttle requests - confirm).\n",
    "        pause_seconds = 45 + 120 * random()\n",
    "        time.sleep(pause_seconds)\n",
    "\n",
    "        article_list = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        html_raw[page_no] = article_list.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "aed20c9b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t"
     ]
    }
   ],
   "source": [
    "# Scrape every page listed in `pages`; results accumulate in html_raw.\n",
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "c2c52b0d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       html_snippets\n",
       "1  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One row per scraped page: index = page number, single column of raw HTML.\n",
    "# The dict becomes one row labelled \"html_snippets\"; transposing turns that label into the column name.\n",
    "df = pd.DataFrame([html_raw], index=[\"html_snippets\"]).T\n",
    "# Label-based slice: shows the rows whose index label lies between 0 and 1.\n",
    "df.loc[0:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "886e2c03",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "# Persist the scraped snippets twice: in IPython's %store database and as a\n",
    "# pickle file named \"html_raw\" in the working directory.\n",
    "%store html_raw\n",
    "import pickle\n",
    "\n",
    "# The context manager closes (and flushes) the file even if dump() raises.\n",
    "with open(\"html_raw\", 'wb') as filehandler:\n",
    "    pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "28087016",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "50\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div class=\"weui-desktop-radio-group\"&gt;&lt;label c...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "12  <div class=\"weui-desktop-radio-group\"><label c..."
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# duplicated() compares all columns by default and flags every repeat after the first occurrence.\n",
    "df_out = df.drop_duplicates()\n",
    "print(len(df_out))\n",
    "# Rows whose snippet repeats an earlier page's snippet.\n",
    "df.loc[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "ab931b77",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[12]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[12]"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pages to scrape again: first those whose snippet duplicated an earlier page ...\n",
    "try_again = df.loc[df.duplicated()].index.tolist()\n",
    "print(try_again)\n",
    "# ... then any requested page that is missing from df altogether.\n",
    "try_again += list(set(pages) - set(df.index.values))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "0e758897",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checkpoint: write the de-duplicated snippets to a tab-separated UTF-8 file.\n",
    "# The path template comes from the `fn` config and is filled in with the account name.\n",
    "filename = fn[\"output\"][\"公众号_htm_snippets\"]\n",
    "df_out.to_csv(filename.format(公众号=公众号), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "c8a0600a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15,17,16,14,20,25,13,15,18,21,17,17,22,19,17,14,14,13,9,10,15,9,7,9,9,8,9,10,8,9,9,9,8,9,10,10,10,9,10,9,8,5,9,9,8,9,8,9,9,7,9,"
     ]
    }
   ],
   "source": [
    "from requests_html import HTMLSession\n",
    "def get_content(link):\n",
    "    \"\"\"Download an article page and return its body text as one string.\n",
    "\n",
    "    The text nodes of every <span> and of every <p> below the element with\n",
    "    id=\"js_content\" are collected separately and concatenated, spans first.\n",
    "    Returns an empty string when neither XPath matches anything.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    link : str\n",
    "        URL of the article to fetch.\n",
    "    \"\"\"\n",
    "    content_xpath_1 = '//*[@id=\"js_content\"]//span/text()'\n",
    "    content_xpath_2 = '//*[@id=\"js_content\"]//p/text()'\n",
    "    # The session is closed on exit so that its connections are not leaked;\n",
    "    # this function is called once per article.\n",
    "    with HTMLSession() as session:\n",
    "        r = session.get(url=link)\n",
    "        content_1 = ''.join(r.html.xpath(content_xpath_1))\n",
    "        content_2 = ''.join(r.html.xpath(content_xpath_2))\n",
    "    return content_1 + content_2\n",
    "\n",
    "def parse_html_snippets(_snippet_):\n",
    "    \"\"\"Turn one page's article-list HTML into a DataFrame of articles.\n",
    "\n",
    "    Extracts titles, creation dates and links by XPath, downloads each\n",
    "    linked article with get_content(), and returns a DataFrame with the\n",
    "    columns title, create_time, link and content_text.\n",
    "\n",
    "    NOTE(review): `fromstring` is presumably lxml's HTML parser imported in\n",
    "    an earlier cell - confirm. The three XPath results must have the same\n",
    "    length, otherwise the DataFrame constructor raises.\n",
    "    \"\"\"\n",
    "    tree = fromstring(_snippet_)\n",
    "    title_nodes = tree.xpath('//div[@class=\"inner_link_article_title\"]//span[2]')\n",
    "    date_nodes = tree.xpath('//div[@class=\"inner_link_article_date\"]')\n",
    "    links = list(tree.xpath('//a/@href'))\n",
    "    columns = {\n",
    "        \"title\": [node.text for node in title_nodes],\n",
    "        \"create_time\": [node.text for node in date_nodes],\n",
    "        \"link\": links,\n",
    "        # One HTTP request per article link.\n",
    "        \"content_text\": [get_content(url) for url in links],\n",
    "    }\n",
    "    return pd.DataFrame(columns)\n",
    "    \n",
    "# Parse every page's snippet into its own DataFrame; the printed numbers are\n",
    "# the article counts per page.\n",
    "l_df = []\n",
    "for p in pages:\n",
    "    snippet_html = df.loc[p, \"html_snippets\"]\n",
    "    _df_ = parse_html_snippets(snippet_html)\n",
    "    print(len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "bbe786b4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>放假通知！再上4天班，广州人集体放假！看到最后我哭了…</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>现在，小编要告诉大家一个好消息，还有一个坏消息大家想要听哪一个呢？好吧，小编先说一个坏消息2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>员村 | 江边3500㎡水上乐园，¥29.9起1大1小！周末节假日通用</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>前500套=29.9元（门市价：100元）500套后=39.9元（门市价：100元）使用日期...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>北京路音乐餐吧118元享双人餐！西冷牛扒+黑毛猪扒+脆皮烤鸡…张口就回本！</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>1928音乐餐厅｜浪漫西餐  套餐包含 扫码马上抢购 /// 本期导航/// 1、复古情调...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>山东红灯樱桃78元抢购3斤！个大核小，肉厚多汁！果园直发</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>初夏5-6月正是吃樱桃的最好时节忍不住来一句“红了樱桃，绿了芭蕉。”听说山东临沂的红灯樱桃这...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>涨涨涨！广州5月最新房价出炉！这个区年内涨幅最大！</td>\n",
       "      <td>2021-05-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>即将进入2021上半年的最后一个月你们定好今年要实现的目标了吗？都准备在广州盘几套房？前段时...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>597</th>\n",
       "      <td>夏日带娃不用愁！精致妈妈的免洗护发神器！3分钟养出滋润秀发！</td>\n",
       "      <td>2020-08-15</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>对于很多女孩子来说，洗头不只是洗头，而是“洗头护发吹干梳理”这样一连串的漫~长过程，分分钟要...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>598</th>\n",
       "      <td>20+款解暑凉拌菜！这家10㎡的广式凉拌店，专治没胃口！</td>\n",
       "      <td>2020-08-15</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>这些日子里，我们吃饭天团美食放在面前都没了动筷的激情原因是：太热了，晒烦了，吃不下了 直到瓜...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>599</th>\n",
       "      <td>99元配防蓝光眼镜！千款镜框任你选，高空下午茶免费叹！</td>\n",
       "      <td>2020-08-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>对近视的盆友来说  这真的是难得一遇的大优惠啊这羊毛不薅就亏了！▼▼▼我们的老朋友一点零视光...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>600</th>\n",
       "      <td>北方人：“来广州第一次去菜市场，我被感动哭了...”</td>\n",
       "      <td>2020-08-13</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>一方水土养一方人，南方湿冷VS北方干冷、咸甜之争、搓澡VS洗澡...有关南北两地差异的纷争从...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>601</th>\n",
       "      <td>人均20！藏在永庆坊的甜品火锅，上桌冒着仙气，椰奶免费续！</td>\n",
       "      <td>2020-08-13</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>■  ■  ■  /  夏天，糖水和冰更配  / 噔噔蹬...甜品控赶紧看过来我又有新发现...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>602 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     title create_time  \\\n",
       "0              放假通知！再上4天班，广州人集体放假！看到最后我哭了…  2021-05-25   \n",
       "1      员村 | 江边3500㎡水上乐园，¥29.9起1大1小！周末节假日通用  2021-05-25   \n",
       "2    北京路音乐餐吧118元享双人餐！西冷牛扒+黑毛猪扒+脆皮烤鸡…张口就回本！  2021-05-25   \n",
       "3             山东红灯樱桃78元抢购3斤！个大核小，肉厚多汁！果园直发  2021-05-25   \n",
       "4                涨涨涨！广州5月最新房价出炉！这个区年内涨幅最大！  2021-05-24   \n",
       "..                                     ...         ...   \n",
       "597         夏日带娃不用愁！精致妈妈的免洗护发神器！3分钟养出滋润秀发！  2020-08-15   \n",
       "598           20+款解暑凉拌菜！这家10㎡的广式凉拌店，专治没胃口！  2020-08-15   \n",
       "599            99元配防蓝光眼镜！千款镜框任你选，高空下午茶免费叹！  2020-08-14   \n",
       "600             北方人：“来广州第一次去菜市场，我被感动哭了...”  2020-08-13   \n",
       "601          人均20！藏在永庆坊的甜品火锅，上桌冒着仙气，椰奶免费续！  2020-08-13   \n",
       "\n",
       "                                                  link  \\\n",
       "0    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "1    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "2    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "3    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "4    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "..                                                 ...   \n",
       "597  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "598  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "599  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "600  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "601  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "\n",
       "                                          content_text  \n",
       "0    现在，小编要告诉大家一个好消息，还有一个坏消息大家想要听哪一个呢？好吧，小编先说一个坏消息2...  \n",
       "1    前500套=29.9元（门市价：100元）500套后=39.9元（门市价：100元）使用日期...  \n",
       "2     1928音乐餐厅｜浪漫西餐  套餐包含 扫码马上抢购 /// 本期导航/// 1、复古情调...  \n",
       "3    初夏5-6月正是吃樱桃的最好时节忍不住来一句“红了樱桃，绿了芭蕉。”听说山东临沂的红灯樱桃这...  \n",
       "4    即将进入2021上半年的最后一个月你们定好今年要实现的目标了吗？都准备在广州盘几套房？前段时...  \n",
       "..                                                 ...  \n",
       "597  对于很多女孩子来说，洗头不只是洗头，而是“洗头护发吹干梳理”这样一连串的漫~长过程，分分钟要...  \n",
       "598  这些日子里，我们吃饭天团美食放在面前都没了动筷的激情原因是：太热了，晒烦了，吃不下了 直到瓜...  \n",
       "599  对近视的盆友来说  这真的是难得一遇的大优惠啊这羊毛不薅就亏了！▼▼▼我们的老朋友一点零视光...  \n",
       "600  一方水土养一方人，南方湿冷VS北方干冷、咸甜之争、搓澡VS洗澡...有关南北两地差异的纷争从...  \n",
       "601   ■  ■  ■  /  夏天，糖水和冰更配  / 噔噔蹬...甜品控赶紧看过来我又有新发现...  \n",
       "\n",
       "[602 rows x 4 columns]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the per-page frames into one table with a fresh 0..n-1 index.\n",
    "df_url_out = pd.concat(l_df, ignore_index=True)\n",
    "df_url_out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "794cf001",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>191</th>\n",
       "      <td>99元秒原价260元2-3人套餐！火爆全网的盅盅火锅菜品低至2元，超抵食！</td>\n",
       "      <td>2021-03-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>工具条上设置固定宽高背景可以设置被包含可以完美对齐背景图和文字以及制作自己的模板工具条上设置...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>192</th>\n",
       "      <td>微信重磅更新！30秒视频，999个表情包！还有...</td>\n",
       "      <td>2021-03-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>微信 iOS 版更新 8.0.3 版表情包上限增加了999个！朋友圈可发30s视频！.......</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>百乐门喜宴￥78元粤式点心2-3人餐！￥169元抢​海陆4人餐，节假日通用！</td>\n",
       "      <td>2021-03-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>抢购日期：即日-2021年4月15日2021年3月26日确认短信：购买成功后即可收到含电子码...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>194</th>\n",
       "      <td>踩屎感爆棚的“气垫拖鞋”！全掌按摩，防滑耐磨易清洗，每走一步都是享受！</td>\n",
       "      <td>2021-03-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>在外辛苦奔波一整天全身上下疲惫不堪，而鞋子又闷又挤，双脚又酸又胀回到家后，第一件事就是换上一...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>195</th>\n",
       "      <td>SPAAARK超级乐园潮玩馆，49.9元双人VR游戏玩足2小时！</td>\n",
       "      <td>2021-03-31</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>你想360°沉浸体验游戏吗？你想视、听、触觉全感官刺激么？你想上天入地，穿越古今么？眼见并非...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>196</th>\n",
       "      <td>荔湾广场宣布升级改造项目动工！网友：边个咁大胆啊？！</td>\n",
       "      <td>2021-03-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>谈起广州的购物娱乐好去处大家立刻会想到各大商圈但唯独有一个商业广场广州人谈之色变，望而却步没...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>197</th>\n",
       "      <td>补牙38元一只！69.9元高品质洗牙！限门店新用户参与</td>\n",
       "      <td>2021-03-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>现代人每天都会刷牙但是刷牙的时长和姿势不对时间一长，口腔里不仅细菌肆意滋生还会导致各种口腔疾...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198</th>\n",
       "      <td>珠江新城酒店自助餐199元起2大1小！美食狂欢~叹环球海鲜！</td>\n",
       "      <td>2021-03-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>周末节假可用！免停车3小时！珠江新城性价比zui高自助餐！确认短信：购买成功后即可收到含电子...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199</th>\n",
       "      <td>人均30+元上广州塔看电影！全国金逸影城任意厅通兑！</td>\n",
       "      <td>2021-03-30</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>购票软件上看IMAX厅电影1张就要80+这也太贵了吧？？吓得小编默默捂紧钱包还是来看看粉丝福...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>200</th>\n",
       "      <td>下一站，厦滘！10000+广漂族梦开始的地方！</td>\n",
       "      <td>2021-03-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>· 这，就是厦滘  · 厦滘，位于番禺区沙滘岛的中部，占地面积1平方公里，别看面积不大...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>201</th>\n",
       "      <td>1元买生活用品！快来领叮当快药大促好礼</td>\n",
       "      <td>2021-03-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202</th>\n",
       "      <td>云南香水小菠萝19.9元5斤！ 清爽脆甜多汁，肉细腻无渣！买即送菠萝刀</td>\n",
       "      <td>2021-03-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>又到了黄灿灿的菠萝风靡大街的季节啦！！！菠萝迷们可以敞开了吃了！吃过那么多菠萝，你吃过云南香...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>203</th>\n",
       "      <td>顺德长鹿旅游休博园，169元抢2大2小特惠套票！</td>\n",
       "      <td>2021-03-29</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>抢购价：169元/套（门市价400元）抢购日期：即日-2021年3月31日使用日期：购买次日...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>204</th>\n",
       "      <td>周末优惠来啦！好吃的好玩的都给你们准备好啦～\\n\\n渔民新村￥88抢点心3-4人餐！￥168...</td>\n",
       "      <td>2021-03-28</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>205</th>\n",
       "      <td>《广东最难懂的方言排名榜》出炉！第一名竟然是……</td>\n",
       "      <td>2021-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>外省的朋友们经常觉得粤语好难但是在广州这城市每个人讲的方言都有可能不一样就算同样是广东人也有...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>206</th>\n",
       "      <td>市二宫电影票39.9元2张+爆米花1份！2D3D通兑，周末不加价</td>\n",
       "      <td>2021-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>看电影还是得去电影院哪！之所以喜欢去还是因为电影院有的仪式感尤其对于影迷们来说出色的视听效果...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>207</th>\n",
       "      <td>电玩城仅39.9元抢购100枚游戏币！地铁直达，全场任玩，找回儿时快乐！</td>\n",
       "      <td>2021-03-27</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>抢购价：39.9元/套（门市价150元）使用日期：即日-2021年4月15日确认短信：购买成...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 title create_time  \\\n",
       "191              99元秒原价260元2-3人套餐！火爆全网的盅盅火锅菜品低至2元，超抵食！  2021-03-31   \n",
       "192                         微信重磅更新！30秒视频，999个表情包！还有...  2021-03-31   \n",
       "193             百乐门喜宴￥78元粤式点心2-3人餐！￥169元抢​海陆4人餐，节假日通用！  2021-03-31   \n",
       "194                踩屎感爆棚的“气垫拖鞋”！全掌按摩，防滑耐磨易清洗，每走一步都是享受！  2021-03-31   \n",
       "195                   SPAAARK超级乐园潮玩馆，49.9元双人VR游戏玩足2小时！  2021-03-31   \n",
       "196                         荔湾广场宣布升级改造项目动工！网友：边个咁大胆啊？！  2021-03-30   \n",
       "197                        补牙38元一只！69.9元高品质洗牙！限门店新用户参与  2021-03-30   \n",
       "198                     珠江新城酒店自助餐199元起2大1小！美食狂欢~叹环球海鲜！  2021-03-30   \n",
       "199                         人均30+元上广州塔看电影！全国金逸影城任意厅通兑！  2021-03-30   \n",
       "200                            下一站，厦滘！10000+广漂族梦开始的地方！  2021-03-29   \n",
       "201                                1元买生活用品！快来领叮当快药大促好礼  2021-03-29   \n",
       "202                云南香水小菠萝19.9元5斤！ 清爽脆甜多汁，肉细腻无渣！买即送菠萝刀  2021-03-29   \n",
       "203                           顺德长鹿旅游休博园，169元抢2大2小特惠套票！  2021-03-29   \n",
       "204  周末优惠来啦！好吃的好玩的都给你们准备好啦～\\n\\n渔民新村￥88抢点心3-4人餐！￥168...  2021-03-28   \n",
       "205                           《广东最难懂的方言排名榜》出炉！第一名竟然是……  2021-03-27   \n",
       "206                   市二宫电影票39.9元2张+爆米花1份！2D3D通兑，周末不加价  2021-03-27   \n",
       "207               电玩城仅39.9元抢购100枚游戏币！地铁直达，全场任玩，找回儿时快乐！  2021-03-27   \n",
       "\n",
       "                                                  link  \\\n",
       "191  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "192  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "193  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "194  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "195  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "196  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "197  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "198  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "199  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "200  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "201  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "202  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "203  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "204  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "205  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "206  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "207  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "\n",
       "                                          content_text  \n",
       "191  工具条上设置固定宽高背景可以设置被包含可以完美对齐背景图和文字以及制作自己的模板工具条上设置...  \n",
       "192  微信 iOS 版更新 8.0.3 版表情包上限增加了999个！朋友圈可发30s视频！.......  \n",
       "193  抢购日期：即日-2021年4月15日2021年3月26日确认短信：购买成功后即可收到含电子码...  \n",
       "194  在外辛苦奔波一整天全身上下疲惫不堪，而鞋子又闷又挤，双脚又酸又胀回到家后，第一件事就是换上一...  \n",
       "195  你想360°沉浸体验游戏吗？你想视、听、触觉全感官刺激么？你想上天入地，穿越古今么？眼见并非...  \n",
       "196  谈起广州的购物娱乐好去处大家立刻会想到各大商圈但唯独有一个商业广场广州人谈之色变，望而却步没...  \n",
       "197  现代人每天都会刷牙但是刷牙的时长和姿势不对时间一长，口腔里不仅细菌肆意滋生还会导致各种口腔疾...  \n",
       "198  周末节假可用！免停车3小时！珠江新城性价比zui高自助餐！确认短信：购买成功后即可收到含电子...  \n",
       "199  购票软件上看IMAX厅电影1张就要80+这也太贵了吧？？吓得小编默默捂紧钱包还是来看看粉丝福...  \n",
       "200     · 这，就是厦滘  · 厦滘，位于番禺区沙滘岛的中部，占地面积1平方公里，别看面积不大...  \n",
       "201                                                     \n",
       "202  又到了黄灿灿的菠萝风靡大街的季节啦！！！菠萝迷们可以敞开了吃了！吃过那么多菠萝，你吃过云南香...  \n",
       "203  抢购价：169元/套（门市价400元）抢购日期：即日-2021年3月31日使用日期：购买次日...  \n",
       "204                                                     \n",
       "205  外省的朋友们经常觉得粤语好难但是在广州这城市每个人讲的方言都有可能不一样就算同样是广东人也有...  \n",
       "206  看电影还是得去电影院哪！之所以喜欢去还是因为电影院有的仪式感尤其对于影迷们来说出色的视听效果...  \n",
       "207  抢购价：39.9元/套（门市价150元）使用日期：即日-2021年4月15日确认短信：购买成...  "
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows that exactly repeat an earlier row in every column.\n",
    "df_url_out.loc[df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "6546221f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>content_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>放假通知！再上4天班，广州人集体放假！看到最后我哭了…</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>现在，小编要告诉大家一个好消息，还有一个坏消息大家想要听哪一个呢？好吧，小编先说一个坏消息2...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>员村 | 江边3500㎡水上乐园，¥29.9起1大1小！周末节假日通用</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>前500套=29.9元（门市价：100元）500套后=39.9元（门市价：100元）使用日期...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>北京路音乐餐吧118元享双人餐！西冷牛扒+黑毛猪扒+脆皮烤鸡…张口就回本！</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>1928音乐餐厅｜浪漫西餐  套餐包含 扫码马上抢购 /// 本期导航/// 1、复古情调...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>山东红灯樱桃78元抢购3斤！个大核小，肉厚多汁！果园直发</td>\n",
       "      <td>2021-05-25</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>初夏5-6月正是吃樱桃的最好时节忍不住来一句“红了樱桃，绿了芭蕉。”听说山东临沂的红灯樱桃这...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>涨涨涨！广州5月最新房价出炉！这个区年内涨幅最大！</td>\n",
       "      <td>2021-05-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>即将进入2021上半年的最后一个月你们定好今年要实现的目标了吗？都准备在广州盘几套房？前段时...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>597</th>\n",
       "      <td>夏日带娃不用愁！精致妈妈的免洗护发神器！3分钟养出滋润秀发！</td>\n",
       "      <td>2020-08-15</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>对于很多女孩子来说，洗头不只是洗头，而是“洗头护发吹干梳理”这样一连串的漫~长过程，分分钟要...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>598</th>\n",
       "      <td>20+款解暑凉拌菜！这家10㎡的广式凉拌店，专治没胃口！</td>\n",
       "      <td>2020-08-15</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>这些日子里，我们吃饭天团美食放在面前都没了动筷的激情原因是：太热了，晒烦了，吃不下了 直到瓜...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>599</th>\n",
       "      <td>99元配防蓝光眼镜！千款镜框任你选，高空下午茶免费叹！</td>\n",
       "      <td>2020-08-14</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>对近视的盆友来说  这真的是难得一遇的大优惠啊这羊毛不薅就亏了！▼▼▼我们的老朋友一点零视光...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>600</th>\n",
       "      <td>北方人：“来广州第一次去菜市场，我被感动哭了...”</td>\n",
       "      <td>2020-08-13</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>一方水土养一方人，南方湿冷VS北方干冷、咸甜之争、搓澡VS洗澡...有关南北两地差异的纷争从...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>601</th>\n",
       "      <td>人均20！藏在永庆坊的甜品火锅，上桌冒着仙气，椰奶免费续！</td>\n",
       "      <td>2020-08-13</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...</td>\n",
       "      <td>■  ■  ■  /  夏天，糖水和冰更配  / 噔噔蹬...甜品控赶紧看过来我又有新发现...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>585 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     title create_time  \\\n",
       "0              放假通知！再上4天班，广州人集体放假！看到最后我哭了…  2021-05-25   \n",
       "1      员村 | 江边3500㎡水上乐园，¥29.9起1大1小！周末节假日通用  2021-05-25   \n",
       "2    北京路音乐餐吧118元享双人餐！西冷牛扒+黑毛猪扒+脆皮烤鸡…张口就回本！  2021-05-25   \n",
       "3             山东红灯樱桃78元抢购3斤！个大核小，肉厚多汁！果园直发  2021-05-25   \n",
       "4                涨涨涨！广州5月最新房价出炉！这个区年内涨幅最大！  2021-05-24   \n",
       "..                                     ...         ...   \n",
       "597         夏日带娃不用愁！精致妈妈的免洗护发神器！3分钟养出滋润秀发！  2020-08-15   \n",
       "598           20+款解暑凉拌菜！这家10㎡的广式凉拌店，专治没胃口！  2020-08-15   \n",
       "599            99元配防蓝光眼镜！千款镜框任你选，高空下午茶免费叹！  2020-08-14   \n",
       "600             北方人：“来广州第一次去菜市场，我被感动哭了...”  2020-08-13   \n",
       "601          人均20！藏在永庆坊的甜品火锅，上桌冒着仙气，椰奶免费续！  2020-08-13   \n",
       "\n",
       "                                                  link  \\\n",
       "0    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "1    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "2    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "3    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "4    http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "..                                                 ...   \n",
       "597  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "598  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "599  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "600  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "601  http://mp.weixin.qq.com/s?__biz=MjM5MzYxMTU2MA...   \n",
       "\n",
       "                                          content_text  \n",
       "0    现在，小编要告诉大家一个好消息，还有一个坏消息大家想要听哪一个呢？好吧，小编先说一个坏消息2...  \n",
       "1    前500套=29.9元（门市价：100元）500套后=39.9元（门市价：100元）使用日期...  \n",
       "2     1928音乐餐厅｜浪漫西餐  套餐包含 扫码马上抢购 /// 本期导航/// 1、复古情调...  \n",
       "3    初夏5-6月正是吃樱桃的最好时节忍不住来一句“红了樱桃，绿了芭蕉。”听说山东临沂的红灯樱桃这...  \n",
       "4    即将进入2021上半年的最后一个月你们定好今年要实现的目标了吗？都准备在广州盘几套房？前段时...  \n",
       "..                                                 ...  \n",
       "597  对于很多女孩子来说，洗头不只是洗头，而是“洗头护发吹干梳理”这样一连串的漫~长过程，分分钟要...  \n",
       "598  这些日子里，我们吃饭天团美食放在面前都没了动筷的激情原因是：太热了，晒烦了，吃不下了 直到瓜...  \n",
       "599  对近视的盆友来说  这真的是难得一遇的大优惠啊这羊毛不薅就亏了！▼▼▼我们的老朋友一点零视光...  \n",
       "600  一方水土养一方人，南方湿冷VS北方干冷、咸甜之争、搓澡VS洗澡...有关南北两地差异的纷争从...  \n",
       "601   ■  ■  ■  /  夏天，糖水和冰更配  / 噔噔蹬...甜品控赶紧看过来我又有新发现...  \n",
       "\n",
       "[585 rows x 4 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The same table with exact-duplicate rows removed (first occurrence kept).\n",
    "df_url_out.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "a4c8d209",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Export the articles to an Excel workbook named after the account.\n",
    "# Exact-duplicate rows are dropped first, so the workbook does not repeat\n",
    "# articles that the duplicate check in this notebook already identified.\n",
    "with pd.ExcelWriter('{公众号}公众号链接及文章.xlsx'.format(公众号=公众号), mode='w', engine=\"openpyxl\") as writer:\n",
    "    df_url_out.drop_duplicates().to_excel(writer, sheet_name=公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d0f86cc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.6rc1"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
